# Global knitr chunk options: default figure size for every plot in the report.
# (The original had this line duplicated, with a pasted "no package called
# 'knitr'" console error in between — knitting residue, removed.)
knitr::opts_chunk$set(fig.width = 12, fig.height = 8)
library(tidyverse)
library(descr)
library(Amelia)
library(lubridate)
library(ggmap)
library(quantreg)
library(shapefiles)
library(leaflet)
library(stringi)
library(ngram)
library(rgdal)
library(plotly)
library(data.table)
# Load the Toronto Yelp data: one row per business, and one row per review.
business <- read_csv("yelp_toronto_business.csv")
review <- read_csv("yelp_toronto_review.csv")
Our focus will be on the business data, but we will use the review data to get some insights about businesses and will do some feature engineering to create useful variables for the model.
glimpse(review)
Observations: 474,803
Variables: 9
$ review_id <chr> "kS4hrhEScwB9V5JATYjvVQ", "YDJDfKnx6VpMMo4EBxycGg", "2Hk7DNwu3...
$ user_id <chr> "hxqo4NyJFfeOmuoVi--s1A", "FCtoTo9zSH1cSAkascfEHw", "YHWsLBS8j...
$ business_id <chr> "f5O7v_X_jCg2itqacRfxhg", "7xA6iSP0Ndn08tpBFQtUKA", "SmizR7MLt...
$ stars <int> 5, 1, 4, 4, 2, 3, 2, 3, 4, 4, 2, 5, 4, 5, 4, 1, 1, 4, 3, 3, 3,...
$ date <date> 2017-10-12, 2017-05-22, 2011-06-01, 2011-11-07, 2011-08-20, 2...
$ text <chr> "Sansotei serves some top notch ramen. They take no reservatio...
$ useful <int> 0, 0, 1, 4, 0, 2, 9, 3, 2, 5, 8, 16, 0, 7, 8, 1, 5, 1, 3, 2, 2...
$ funny <int> 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 0, 0, 1, 2, 0, 1, 2, 0,...
$ cool <int> 0, 0, 1, 0, 0, 1, 4, 2, 1, 3, 1, 9, 0, 0, 3, 0, 1, 2, 2, 1, 2,...
# Are there any missing values in the review data?
# sum(is.na(df)) counts NA cells across the whole data frame.
# (Message grammar fixed: "There is : 0 missing values" -> "There are ...".)
paste("There are", sum(is.na(review)), "missing values in the review data.")
[1] "There is : 0 missing values in the review data."
Sounds good, there are no missing values in the review dataset. Later we will check if some variable will help for building our model. For that data set, we will select later 4 features (‘business_id’, ‘stars’, ‘text’, ‘date’) and try to create new features and add them to the initial business data.
# Feature engineering: add per-review length features.
# Refer to the `text` column directly inside mutate() — the original used
# `review$text`, which bypasses the pipeline data mask and breaks under
# grouping (same result here, but fragile). Dead commented-out pipeline
# steps removed.
review <- review %>%
  mutate(
    no_characters = nchar(text),              # character count per review
    no_words = str_count(text, "\\w+")        # word count per review
  )
Now let's check the business data set and answer the questions below:
Does the data contain any missing values? What is the distribution of the variables? How are the different variables related to each other?
# Business data structure: column types and sample values for each feature.
glimpse(business)
Observations: 18,233
Variables: 13
$ address <chr> "631 Bloor St W", "595 Markham Street", "746 Street Clair Ave...
$ business_id <chr> "9A2quhZLyWk0akUetBd8hQ", "tZnSodhPwNr4bzrwJ1CSbw", "5J3b7j3F...
$ categories <chr> "Food, Bakeries", "Cajun/Creole, Southern, Restaurants", "Foo...
$ city <chr> "Toronto", "Toronto", "Toronto", "Toronto", "Toronto", "Toron...
$ is_open <int> 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1...
$ latitude <dbl> 43.66438, 43.66412, 43.68133, 43.67088, 43.73340, 43.66196, 4...
$ longitude <dbl> -79.41442, -79.41189, -79.42788, -79.39238, -79.22421, -79.39...
$ name <chr> "Bnc Cake House", "Southern Accent Restaurant", "Mabel's Bake...
$ neighborhood <chr> "Koreatown", "Palmerston", "Wychwood", "Yorkville", "Scarboro...
$ postal_code <chr> "M6G 1K8", "M6G 2L7", "M6C 1B5", "M5R 3K5", "M1M 1P8", "M5S",...
$ review_count <int> 7, 146, 23, 25, 3, 3, 3, 105, 3, 31, 9, 3, 51, 5, 6, 4, 15, 3...
$ stars <dbl> 4.0, 4.0, 4.0, 3.5, 2.0, 4.5, 4.5, 4.0, 3.5, 4.0, 4.0, 1.0, 3...
$ state <chr> "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "ON", "...
# let visualize the percentage of missing values for each feature
missing <- data.table(pmiss = sapply(business, function(x) { (sum(is.na(x)) / length(x)) }),
column = names(business))
p <- ggplot(missing,aes(x = reorder(column, -pmiss), y = pmiss)) +
geom_bar(stat = 'identity', fill = 'steelblue') +
scale_y_continuous(labels = scales::percent) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
x = 'Feature',
y = '% missing',
title = "Missing data by feature"
)
ggplotly(p)
We have that summary for the business data: 18,233 Observations and 13 Variables
$ address character class with 14422 unique values and 283 missing values
$ business_id character class with 18233 unique values and 0 missing values
$ categories character class with 10028 unique values and 33 missing values
$ city character class with 1 unique values and 0 missing values
$ is_open integer class with 2 unique values and 0 missing values
$ latitude numeric class with 15366 unique values and 1 missing values
$ longitude numeric class with 15315 unique values and 1 missing values
$ name character class with 15292 unique values and 0 missing values
$ neighborhood character class with 80 unique values and 3435 missing values
$ postal_code character class with 5261 unique values and 117 missing values
$ review_count integer class with 380 unique values and 0 missing values
$ stars numeric class with 9 unique values and 0 missing values
$ state character class with 2 unique values and 0 missing values
The neighborhood variable has the most missing values (19%).
# correcting coordinates
# Overwrite the rows for "2138 Queen Street E" with manually looked-up
# longitude/latitude. The which()-indexed assignment relies on filter()
# returning the same rows in the same order as which() — true here because
# both scan the data frame top to bottom.
business[which(business$address == "2138 Queen Street E"),] <- business %>%
filter(business$address == "2138 Queen Street E") %>%
mutate(longitude = -79.293425, latitude = 43.671584)
#adding coordinates for the postal code M5H 4G1 (lon: -79.3854, lat: 43.6508)
# Impute the missing coordinate pair(s) with the hand-geocoded values above.
business[which(is.na(business$longitude) == TRUE),] <- business %>%
filter(is.na(longitude) == TRUE) %>%
mutate(longitude = -79.3854, latitude = 43.6508)
# Normalize text columns: lower-case categories/name/neighborhood and trim
# whitespace around names so duplicate businesses compare equal later.
# NOTE(review): the original comment claimed is_open/stars were converted to
# factors here, but no such conversion was performed (it happens much later,
# before the resampling experiments). Stray trailing comma in mutate()
# removed — older dplyr versions error on an empty argument.
business <- business %>%
  mutate(
    categories = str_to_lower(categories),
    name = str_trim(str_to_lower(name), side = 'both'),
    neighborhood = str_to_lower(neighborhood)
  )
Let's check the distribution of the target variable. How many businesses are open? How many are closed?
# Target variable distribution
# Bar chart of closed (0) vs open (1); is_open is coerced to factor so
# ggplot treats it as categorical and gives one bar (and colour) per class.
p <- ggplot(business, aes(x = as.factor(is_open))) +
geom_bar(aes(fill = as.factor(is_open)))
ggplotly(p)
14023 businesses are open (77%) and 4210 closed (23%). Our target variable is imbalanced.
Does business closure depend on rating?
# visualizing the number of business per rating star
# Left panel: overall count of businesses per star level.
p1 <- ggplotly(
ggplot(business, mapping = aes(as.factor(stars))) +
geom_bar(fill = 'darkblue') +
labs( x = "Star rating", y = "Number of businesses", title ="Star rating distribution business open/closed")
)
# Right panel: same counts split (dodged) by open/closed status.
p2 <- ggplotly(
ggplot(business, mapping = aes(as.factor(stars), fill = as.factor(is_open))) +
geom_bar(position = "dodge") +
labs( x = "Star rating", y = "Number of businesses", title ="Star rating distribution business open/closed")
)
# Arrange both interactive panels side by side with shared axes.
subplot(p1, p2, nrows = 1, margin = 0.02, shareX = TRUE, shareY = TRUE, titleX = TRUE, titleY = TRUE)
#ggsave("stars_dist2.png", width = 5, height = 5)
Around 70% of the businesses got a rating between 3 and 4.5. From the graph above, we can gather that the distribution of businesses per rating has the same shape for ‘open’ businesses and ‘closed’ businesses. One assumption we can make is that ratings have no impact on a business closure. We will confirm this (or not) later.
Let's visualize whether “closed” businesses received fewer reviews, and whether ratings are correlated with the number of reviews.
# The distribution of reviews
#we have applied log since the distribution is skewed
# NOTE(review): binwidth = 0.1 silently overrides bins = 50 (binwidth wins
# when both are given); the 0.1 width applies on the log10 x-scale. The
# pasted "infinite values" warning below comes from scale_y_log10() on bins
# with zero counts (log10(0) = -Inf).
p1 <- ggplotly(
ggplot(data = business, aes(x = review_count)) +
geom_histogram(bins = 50,binwidth = 0.1, fill = 'darkgreen') +
scale_x_log10() +
scale_y_log10() +
labs(title ="Distribution of reviews")
)
Transformation introduced infinite values in continuous y-axis
# Same histogram, dodged by open/closed status. Same caveats as p1 above:
# binwidth overrides bins, and log10 of empty bins triggers the warning.
p2 <- ggplotly(
ggplot(data = business, aes(x = review_count, fill = as.factor(is_open))) +
geom_histogram(bins = 50,binwidth = 0.1, position = 'dodge') +
scale_x_log10() +
scale_y_log10() +
labs(title ="Distribution of reviews")
)
Transformation introduced infinite values in continuous y-axis
# Combine the overall and split review-count histograms side by side.
subplot(p1, p2, nrows = 1, margin = 0.02, shareX = TRUE, shareY = TRUE, titleX = TRUE, titleY = TRUE)
#ggsave("reviews_dist.png", width = 10, height = 5)
# visualizing the number of review per rating star
# BUG FIX: the original passed an extra positional data argument (`stars` /
# `business`) to ggplot() even though the summarized tibble is already piped
# in as the data — that positional argument collides with the named
# `mapping` argument. Only the piped data should be used.
p1 <- ggplotly(
  business %>%
    select(stars, review_count) %>%
    group_by(stars) %>%
    summarize(review_count = sum(review_count)) %>%   # total reviews per star level
    ggplot(mapping = aes(as.factor(stars), review_count)) +
    geom_col(fill = 'darkgreen') +
    labs( x = "Star rating", y = "Number of reviews", title ="Number of reviews per rating star")
)
p2 <- ggplotly(
  business %>%
    select(stars, is_open, review_count) %>%
    group_by(stars, is_open) %>%
    summarize(review_count = sum(review_count)) %>%   # totals split by open/closed
    ggplot(mapping = aes(as.factor(stars), review_count, fill = as.factor(is_open))) +
    geom_col(position = 'dodge') +
    labs( x = "Star rating", y = "Number of reviews", title ="Number of reviews per rating star")
)
subplot(p1, p2, nrows = 2, margin = 0.02, shareX = TRUE, shareY = TRUE, titleX = TRUE, titleY = TRUE)
NA
From the graph above, we can gather that the distribution shape of the number of reviews per star rating is quite similar for ‘open’ and ‘closed’ businesses. As the target variable is not balanced, it is no surprise that for each rating we observe a significant difference in terms of number of reviews.
# Interactive map of ALL businesses, centred on the median coordinates;
# circle radius scales with sqrt(review_count) so heavy outliers don't
# dominate.
center_long = median(business$longitude, na.rm = TRUE)
center_lat = median(business$latitude, na.rm = TRUE)
leaflet(business) %>%
addTiles()%>%
#addProviderTiles("Esri.NatGeoWorldMap") %>%
addCircles(lng = ~ longitude, lat = ~latitude, radius = ~sqrt(review_count)) %>%
setView(lng = center_long, lat = center_lat, zoom = 10)
NA
In the Great Toronto Area (GTA), most of the businesses are located in Downtown Toronto.
“open” Businesses
# Interactive map of OPEN businesses only.
# BUG FIX: is_open is an integer 0/1 column (see glimpse above) — the
# original filtered on the string 'yes', which matches no rows and renders
# an empty map. Filter on 1 instead.
center_long <- median(business$longitude, na.rm = TRUE)
center_lat <- median(business$latitude, na.rm = TRUE)
leaflet(business %>% filter(is_open == 1)) %>%
  addTiles() %>%
  #addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~longitude, lat = ~latitude, radius = ~sqrt(review_count), color = 'green') %>%
  setView(lng = center_long, lat = center_lat, zoom = 10)
# NOTE(review): this chunk is an exact duplicate of the open-businesses map
# directly above — consider deleting one copy.
# BUG FIX (same as above): is_open is integer 0/1, so filter on 1, not the
# string 'yes' which matches nothing.
center_long <- median(business$longitude, na.rm = TRUE)
center_lat <- median(business$latitude, na.rm = TRUE)
leaflet(business %>% filter(is_open == 1)) %>%
  addTiles() %>%
  #addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~longitude, lat = ~latitude, radius = ~sqrt(review_count), color = 'green') %>%
  setView(lng = center_long, lat = center_lat, zoom = 10)
“closed” Businesses
# Interactive map of CLOSED businesses only.
# BUG FIX: is_open is integer 0/1 — filter on 0, not the string 'no' which
# matches no rows.
center_long <- median(business$longitude, na.rm = TRUE)
center_lat <- median(business$latitude, na.rm = TRUE)
leaflet(business %>% filter(is_open == 0)) %>%
  addTiles() %>%
  #addProviderTiles("Esri.NatGeoWorldMap") %>%
  addCircles(lng = ~longitude, lat = ~latitude, radius = ~sqrt(review_count), color = 'purple') %>%
  setView(lng = center_long, lat = center_lat, zoom = 10)
We will look further by neighborhood to see whether a specific location can impact a category of business.
Let's look at the leading business categories in our data set. A business is linked to multiple categories in our dataset, so we have to do a bit of preprocessing, which is simple using the dplyr package.
# Top categories of business in GTA
# Each business lists several comma-separated categories; split them into one
# row per (business, category), trim whitespace, then count occurrences.
# NOTE(review): unnest(col = expr) is the deprecated mutate-style tidyr API —
# newer tidyr expects a separate mutate() + unnest(); works on older versions.
categorie <- business %>%
unnest(categories = str_split(categories, ",")) %>%
mutate(categories = str_trim(categories,side = "both")) %>%
select(categories) %>%
group_by(categories) %>%
summarise(n=n()) %>%
arrange(desc(n)) %>%
head(25)
# Horizontal bar chart of the 25 most frequent categories.
p <- ggplot(categorie, aes(x = reorder(categories, n), y = n)) +
geom_col(aes(fill = n)) +
scale_fill_gradientn(colours=RColorBrewer::brewer.pal(11,"Spectral")) +
coord_flip() +
labs(title ="Top 25 categories")
ggplotly(p)
There are 891 different categories of businesses reviewed in Yelp, and more than 60% of the rated businesses have their activities related to food.
As in the data set each business is linked to multiple categories, we will create a new feature to count the number of categories per business.
# Create a new variable to gather the number of categories for each business
# Split the comma-separated categories again and count rows per business.
b_categorie <- business %>%
unnest(categories = str_split(categories, ",")) %>%
mutate(categories = str_trim(categories,side = "both")) %>%
group_by(business_id, name) %>%
summarize(categorie_count = n()) %>%
arrange(desc(categorie_count))
#Adding a new variable 'categorie_count' to the business data
# full_join keys on (business_id, name); businesses absent from b_categorie
# (e.g. NA categories) get categorie_count = NA — TODO confirm that is wanted.
business <- full_join(business, b_categorie)
Joining, by = c("business_id", "name")
# Top 15 businesses by number of distinct categories.
# (Label typos fixed: 'Buiness name' -> 'Business name', 'mutiple' ->
# 'multiple'.)
p <- ggplot(b_categorie %>% head(15), aes(x = reorder(name, desc(categorie_count)), y = categorie_count)) +
  geom_col(aes(fill = categorie_count)) +
  scale_fill_gradientn(colours = rainbow(11)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    x = 'Business name',
    title = "Top 15 businesses with multiple categories"
  )
ggplotly(p)
NA
NA
**Distribution of category count**
# Distribution of categorie_count: overall (p1) and split open/closed (p2).
p1 <- ggplotly(
ggplot(data = business, aes(x = as.factor(categorie_count))) +
geom_bar(fill = 'darkgreen') +
labs(title ="Distribution of number of categories", y = "Number of businesses")
)
p2 <- ggplotly(
ggplot(data = business, aes(x = as.factor(categorie_count), fill = as.factor(is_open))) +
geom_bar(position = 'dodge') +
labs(title ="Distribution of number of categories", x = "Number of categories", y = "Number of businesses")
)
subplot(p1, p2, nrows = 2, margin = 0.02, shareX = TRUE, shareY = TRUE, titleX = TRUE, titleY = TRUE)
# Boxplots of categorie_count by star rating (p1), split by open/closed (p2),
# and by open/closed alone (p3).
p1 <- ggplotly(
  ggplot(data = business, aes(x = as.factor(stars), y = categorie_count)) +
    geom_boxplot(fill = 'darkgreen') +
    labs(title ="Distribution of number of categories")
)
p2 <- ggplotly(
  ggplot(data = business, aes(x = as.factor(stars), y = categorie_count)) +
    geom_boxplot(aes(colour = as.factor(is_open))) +
    labs(title ="Distribution of number of categories", x = "Number of categories", y = "Number of businesses")
)
# NOTE(review): p3 is built but never included in the subplot below.
# Stray trailing comma in geom_boxplot() removed.
p3 <- ggplotly(
  ggplot(data = business, aes(x = as.factor(is_open), categorie_count)) +
    geom_boxplot(fill = 'darkgreen') +
    labs(title ="Distribution of number of categories")
)
subplot(p1, p2, nrows = 1, margin = 0.05, shareX = TRUE, shareY = TRUE, titleX = TRUE, titleY = TRUE)
review_count and categorie_count
# Scatter of review_count vs categorie_count, coloured by open/closed.
# BUG FIX: the default point shape (19) ignores the `fill` aesthetic, so the
# original rendered all points in one colour; map `colour` instead so the
# open/closed grouping is actually visible.
ggplotly(
  ggplot(business, aes(x = categorie_count, y = review_count)) +
    geom_point(aes(colour = as.factor(is_open)))
)
NA
NA
# Top 15 most frequent business names (highlights chains/franchises).
# count(..., sort = TRUE) is shorthand for select + group_by + summarise(n())
# + arrange(desc(n)).
business_name <- business %>%
  count(name, sort = TRUE) %>%
  head(15)
p <- ggplot(business_name, aes(x = reorder(name, n), y = n)) +
  geom_col(aes(fill = n)) +
  scale_fill_gradientn(colours = RColorBrewer::brewer.pal(11, "Spectral")) +
  coord_flip() +
  labs(title ="Top 15 business names")
ggplotly(p)
# Re-check missing data after feature engineering (categorie_count joined in).
# NOTE(review): `missing` shadows base::missing() within this script.
missing <- data.table(pmiss = sapply(business, function(x) { (sum(is.na(x)) / length(x)) }),
column = names(business))
p <- ggplot(missing,aes(x = reorder(column, -pmiss), y = pmiss)) +
geom_bar(stat = 'identity', fill = 'steelblue') +
scale_y_continuous(labels = scales::percent) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
x = 'Feature',
y = '% missing',
title = "Missing data by feature"
)
ggplotly(p)
# Keep only the modeling features (target + predictors).
# all_of() resolves the external character vector unambiguously — a bare
# vector in select() is ambiguous if a column of the same name exists, and
# newer tidyselect warns/errors on it. `<-` replaces `=` for assignment.
select_cols <- c('is_open', 'review_count', 'stars', 'categorie_count')
data <- business %>% select(all_of(select_cols))
The function ‘createDataPartition’ from the caret package will be used to create a stratified random sample of the data into training and test sets.
# Let's split the data dataset into training and test set.
library(caret)
set.seed(100000)
#data_index <- sample(nrow(data), floor(nrow(data)*0.7))
# Stratified 60/40 split on is_open, preserving the class ratio in both sets.
data_index <- createDataPartition(data$is_open, p = .6, list = FALSE, times = 1)
train.set <- data[data_index,]
test.set <- data[-data_index,]
# Logistic Regression Model
# Baseline classifier: is_open ~ review_count + stars + categorie_count
# (is_open is still numeric 0/1 here, which glm(binomial) accepts).
logreg <- glm(is_open~., data = train.set, family = binomial(link='logit'))
#summary(logreg)
# Performance Evaluation: Confusion Matrix
library(e1071)
# Predict probabilities on the test set (column 1 = target, dropped),
# then classify as open (1) at the 0.5 threshold.
pred_logreg <- ifelse(predict(logreg, test.set[-1], type="response") >= 0.5, 1, 0)
confusionMatrix_logreg <- table(actual = test.set$is_open, predicted = pred_logreg)
confusionMatrix_logreg
# Accuracy = correct predictions (diagonal) / total test rows.
accuracy <- sum(diag(confusionMatrix_logreg))/nrow(test.set)
print(paste('Accuracy =',accuracy))
#install.packages("ROCR")
library(ROCR)
# FIX: build the ROC curve from the predicted probabilities, not from the
# already-thresholded 0/1 labels — thresholded input yields a degenerate
# three-point ROC and understates AUC.
prob_logreg <- predict(logreg, test.set[-1], type = "response")
pr <- prediction(prob_logreg, test.set$is_open)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
print(paste('AUC =', auc))
#install.packages('randomForest')
library(randomForest)
set.seed(100000)
# NOTE(review): is_open is still numeric 0/1 here, so randomForest fits a
# REGRESSION forest and predict() returns continuous values in [0, 1] — if a
# classification forest is intended, the target should be a factor (as done
# for cforest below). TODO confirm which is intended.
rf <- randomForest(is_open ~ .,
data=train.set,
importance=TRUE,
ntree=2000)
# Variable importance plot (%IncMSE / IncNodePurity for regression forests).
varImpPlot(rf)
# Performance Evaluation: Confusion Matrix
# Threshold the regression-forest output at 0.7 to get class labels.
# NOTE(review): 0.5 was used for logistic regression — the 0.7 cutoff here is
# inconsistent and makes the two models' accuracies non-comparable; confirm
# whether it is deliberate.
predicted_rf <- ifelse(predict(rf, test.set[-1]) >= 0.7, 1, 0)
confusionMatrix_rf <- table(actual = test.set$is_open, predicted = predicted_rf)
confusionMatrix_rf
accuracy <- sum(diag(confusionMatrix_rf))/nrow(test.set)
print(paste('Accuracy =',accuracy))
# ROC/AUC for the random forest.
# NOTE(review): this feeds the thresholded 0/1 labels into ROCR, giving a
# degenerate ROC — the raw predict(rf, ...) scores would give a proper curve.
pr <- prediction(predicted_rf, test.set$is_open)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
print(paste('AUC =',auc))
library('partykit')
set.seed(100000)
# Conditional-inference forest; the target is coerced to factor, so this IS a
# classification forest (unlike the randomForest fit above).
mdl_cf <- cforest(as.factor(is_open) ~.,
data = train.set,
ntree=2000, mtry=3)
# Performance Evaluation: Confusion Matrix
# type = 'response' returns predicted class labels (a factor with levels
# "0"/"1"); OOB = TRUE requests out-of-bag-style predictions.
pred_cf <- predict(mdl_cf, test.set[-1], OOB=TRUE, type = 'response')
confusionMatrix_cf <- table(actual = test.set$is_open, predicted = pred_cf)
confusionMatrix_cf
accuracy <- sum(diag(confusionMatrix_cf))/nrow(test.set)
print(paste('Accuracy =',accuracy))
# ROC/AUC for the conditional-inference forest.
# FIX: ROCR::prediction() requires a numeric vector, but pred_cf is a factor
# (cforest was fit on as.factor(is_open)), so the original call errors.
# Convert via character to recover the numeric 0/1 labels.
pr <- prediction(as.numeric(as.character(pred_cf)), test.set$is_open)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
print(paste('AUC =', auc))
# Convert the target to a factor in both sets so the resampling/tree models
# below treat the problem as classification.
train.set <- train.set %>%
mutate(is_open = as.factor(is_open))
test.set <- test.set %>%
mutate(is_open = as.factor(is_open))
# Show the class frequencies/proportions in each set.
CrossTable(train.set$is_open)
CrossTable(test.set$is_open)
library(rpart)
# accuracy.meas() / roc.curve() below (and ovun.sample / ROSE later) come
# from the ROSE package, which was never attached — add it here.
library(ROSE)
# Baseline decision tree on the imbalanced training data.
treeimb <- rpart(is_open ~ ., data = train.set)
# Class-probability matrix; column 2 is P(is_open == "1").
pred.treeimb <- predict(treeimb, newdata = test.set)
accuracy.meas(test.set$is_open, pred.treeimb[,2])
roc.curve(test.set$is_open, pred.treeimb[,2], plotit = FALSE)
# Rebalance the training set four ways with the ROSE package.
# NOTE(review): the N values (11200, 3386, 7293) are hard-coded for this
# particular split — roughly 2x the majority class, 2x the minority class,
# and the original training size respectively; they must be updated if the
# split proportion or seed changes.
data_balanced_over <- ovun.sample(is_open ~ ., data = train.set, method = "over",N = 11200)$data
table(data_balanced_over$is_open)
data_balanced_under <- ovun.sample(is_open ~ ., data = train.set, method = "under", N = 3386, seed = 1)$data
table(data_balanced_under$is_open)
data_balanced_both <- ovun.sample(is_open ~ ., data = train.set, method = "both", p=0.5,
N=7293, seed = 1)$data
#p refers to the probability of positive class in newly generated sample.
table(data_balanced_both$is_open)
# ROSE generates a synthetic balanced sample (not simple resampling).
data.rose <- ROSE(is_open ~ ., data = train.set, seed = 1)$data
table(data.rose$is_open)
#build decision tree models
# One rpart tree per rebalanced training set, for comparison.
tree.rose <- rpart(is_open ~ ., data = data.rose)
tree.over <- rpart(is_open ~ ., data = data_balanced_over)
tree.under <- rpart(is_open ~ ., data = data_balanced_under)
tree.both <- rpart(is_open ~ ., data = data_balanced_both)
#make predictions on unseen data
# Each predict() returns a class-probability matrix (column 2 = P(open)).
pred.tree.rose <- predict(tree.rose, newdata = test.set)
pred.tree.over <- predict(tree.over, newdata = test.set)
pred.tree.under <- predict(tree.under, newdata = test.set)
pred.tree.both <- predict(tree.both, newdata = test.set)
# Compare AUC of the four rebalancing strategies on the held-out test set.
#AUC ROSE
roc.curve(test.set$is_open, pred.tree.rose[,2])
#AUC Oversampling
roc.curve(test.set$is_open, pred.tree.over[,2])
#AUC Undersampling
roc.curve(test.set$is_open, pred.tree.under[,2])
#AUC Both
roc.curve(test.set$is_open, pred.tree.both[,2])
# Internal validation of the ROSE approach: holdout and bootstrap estimates
# of AUC; extr.pred extracts the positive-class probability column from the
# rpart prediction matrix.
ROSE.holdout <- ROSE.eval(is_open ~ ., data = train.set, learner = rpart, method.assess = "holdout", extr.pred = function(obj)obj[,2], seed = 1)
ROSE.holdout
ROSE.boot <- ROSE.eval(is_open ~ ., data = train.set, learner = rpart, method.assess = "BOOT", extr.pred = function(obj)obj[,2], seed = 1)
ROSE.boot